Input

Get list of articles and of MeSH terms

# Read json with MeSH terms by paper
mesh_terms_by_article_list = read_json('./../Data/MeSH_terms_by_paper.json', simplifyVector = TRUE)

# Get list of all mesh terms
all_mesh_terms = c()
n = 0

for(mesh_terms_in_article in mesh_terms_by_article_list){
  all_mesh_terms = unique(c(all_mesh_terms, mesh_terms_in_article))
  n = c(n, length(all_mesh_terms))
}

mesh_terms_counts = data.frame('articles' = seq(1,length(n)), 'mesh_terms'=n)

print(paste0('Articles: ', length(mesh_terms_by_article_list), '       Mesh terms: ', length(all_mesh_terms)))
## [1] "Articles: 3709       Mesh terms: 10372"
ggplotly(mesh_terms_counts %>% ggplot(aes(articles, mesh_terms, color='#434343')) + geom_line() +
         geom_smooth(method='lm', color='#666666', se=FALSE, size=0.5) + theme_minimal() + theme(legend.position='none') +
         ggtitle('Increase in number of MeSH terms by each new article'))
rm(mesh_terms_in_article, n, mesh_terms_counts)

Create article-MeSH term matrix

all_articles = names(mesh_terms_by_article_list)

article_mesh_term_mat = data.frame(matrix(0, nrow=length(all_articles), ncol=sum(!is.na(all_mesh_terms))))
rownames(article_mesh_term_mat) = all_articles
colnames(article_mesh_term_mat) = all_mesh_terms[!is.na(all_mesh_terms)]

for(article in all_articles){
  article_mesh_terms = mesh_terms_by_article_list[[article]]
  if(sum(is.na(article_mesh_terms))==0){
    article_mesh_term_mat[article, article_mesh_terms] = 1
  }
}

rm(mesh_terms_by_article_list)

An important proportion of articles (almost 10%) don’t have MeSH terms

mesh_terms_by_article = data.frame('id' = rownames(article_mesh_term_mat), 'n_mesh_terms' = rowSums(article_mesh_term_mat))
bins = max(mesh_terms_by_article$n_mesh_terms)-min(mesh_terms_by_article$n_mesh_terms)+1
ggplotly(mesh_terms_by_article %>% ggplot(aes(n_mesh_terms)) + geom_histogram(bins=bins, alpha=0.6) + 
              ggtitle('Distribution of number of MeSH terms by article') + theme_minimal())

Most MeSH terms are present in only a few articles

articles_by_mesh_term = data.frame('id' = as.character(colnames(article_mesh_term_mat)),
                                   'n_articles' = colSums(article_mesh_term_mat), stringsAsFactors = FALSE)
bins = max(articles_by_mesh_term$n_articles)-min(articles_by_mesh_term$n_articles)+1
ggplotly(articles_by_mesh_term %>% ggplot(aes(n_articles)) + geom_histogram(bins=bins, alpha=0.6) + 
         scale_x_sqrt() + scale_y_sqrt() + theme_minimal() +
         ggtitle('Distribution of number of articles that contain each MeSH term'))

Many of the most popular MeSH terms aren’t useful

top_n_counts = 50
top_mesh_terms = articles_by_mesh_term %>% top_n(top_n_counts, wt=n_articles)
ggplotly(top_mesh_terms %>% ggplot(aes(reorder(id, n_articles), n_articles, fill=n_articles)) + 
         geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() + 
         ggtitle(paste0('Top ', top_n_counts, ' MeSH Terms')) + 
         xlab('') + ylab('No. Articles') + theme_minimal() + theme(legend.position='none'))
rm(article, article_mesh_terms, mesh_terms_by_article, bins)

Filters:

  • Articles that have no MeSH terms

  • MeSH terms only present in a single article

keep_articles = rowSums(article_mesh_term_mat)>0
keep_mesh_terms = colSums(article_mesh_term_mat)>1

print(paste0(sum(!keep_articles), '/', nrow(article_mesh_term_mat), ' articles have no MeSH terms'))
## [1] "363/3709 articles have no MeSH terms"
print(paste0(sum(!keep_mesh_terms), '/', ncol(article_mesh_term_mat), ' MeSH terms are mentioned in a single article'))
## [1] "6735/10371 MeSH terms are mentioned in a single article"
article_mesh_term_mat = article_mesh_term_mat[keep_articles, keep_mesh_terms]
articles_by_mesh_term = articles_by_mesh_term %>% filter(id %in% colnames(article_mesh_term_mat))

print(paste0('Articles: ', nrow(article_mesh_term_mat), '       Mesh terms: ', ncol(article_mesh_term_mat)))
## [1] "Articles: 3346       Mesh terms: 3636"
rm(keep_articles, keep_mesh_terms)
  • Try to filter out all terms that are not related to the type of experiment that was conducted
keep_terms = c('gen', 'mutation','phenotype','molecul','amino','sequence','nucleotide','polymorphism',
               'knockout','cell','allele','protein','neuron','dna','chromosome','metabolism','signal',
               'transcript','zygote','haplotype','linkage','exome','cluster','blotting','imaging',
               'etiology','biomarkers','spectrometry','case-control','syndrome','cohort','disease model',
               'polymerase','in situ','binding','model','in vitro','electroencephalography','analysis',
               'exon','haplo','codon','rna','splicing','synap','intron','technique','regulat','trait loci',
               'cpg','statistics','chromatin')
articles_by_mesh_term = articles_by_mesh_term %>% 
                        mutate('keep'=ifelse(grepl(paste(keep_terms,collapse='|'), tolower(id)), TRUE, FALSE)) %>%
                        filter(keep)

article_mesh_term_mat = article_mesh_term_mat %>% dplyr::select(articles_by_mesh_term$id)
article_mesh_term_mat = article_mesh_term_mat[rowSums(article_mesh_term_mat)>0, ]

print(paste0('Articles: ', nrow(article_mesh_term_mat), '       Mesh terms: ', ncol(article_mesh_term_mat)))
## [1] "Articles: 3320       Mesh terms: 2629"
rm(keep_terms)
top_n_counts = 50
top_mesh_terms = articles_by_mesh_term %>% top_n(top_n_counts, wt=n_articles)
ggplotly(top_mesh_terms %>% ggplot(aes(reorder(id, n_articles), n_articles, fill=n_articles)) + 
         geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() + 
         ggtitle(paste0('New top ', top_n_counts, ' MeSH Terms')) + 
         xlab('') + ylab('No. Articles') + theme_minimal() + theme(legend.position='none'))
rm(top_n_counts, top_mesh_terms)

Network:

MeSH Terms Network

sparse_article_mesh_term_mat = Matrix(as.matrix(article_mesh_term_mat), sparse=TRUE)

# EDGES
mesh_mesh_mat = t(sparse_article_mesh_term_mat) %*% sparse_article_mesh_term_mat

edges = summary(mesh_mesh_mat) %>% rename('from'=i, 'to'=j, 'count'=x) %>%
        mutate(from = colnames(article_mesh_term_mat)[from],
               to = colnames(article_mesh_term_mat)[to]) %>%
        filter(from!=to)

# Convert count to fraction
article_mesh_term_colsums = colSums(article_mesh_term_mat)
names(article_mesh_term_colsums) = colnames(article_mesh_term_mat)
edges = edges %>% mutate('weight'=count/(article_mesh_term_colsums[from]+article_mesh_term_colsums[to]))

# NODES
nodes = articles_by_mesh_term %>% mutate('id'=as.character(id), 'title'=as.character(id), 'value'=n_articles)

print(paste0('Nodes: ', nrow(nodes), '        Edges: ', nrow(edges)/2))
## [1] "Nodes: 2629        Edges: 80423"
edges %>% ggplot(aes(count, weight)) + geom_point(alpha=0.1, color='#006666') + ylab('proportion') +
          ggtitle('Effects of the change from Count to Proportion on edge weights') + theme_minimal()

graph = graph_from_data_frame(edges, vertices=nodes, directed=FALSE)
graph = graph %>% simplify(edge.attr.comb='first')

eigen_centrality_output = eigen_centrality(graph)
eigencentr = as.numeric(eigen_centrality_output$vector)

Clustering

comms_louvain = cluster_louvain(graph)
modules = comms_louvain %>% membership

color_pal = viridis_pal()(max(modules)) %>% substr(1,7)
graph = graph %>% set_vertex_attr('eigencentrality', value=eigencentr) %>%
                  set_vertex_attr('module', value=as.numeric(modules)) %>% 
                  set_vertex_attr('color_modules', value=color_pal[as.numeric(modules)]) %>%
                  set_vertex_attr('color', value=color_pal[as.numeric(modules)])

comm_sizes = sizes(comms_louvain) %>% data.frame %>% rename(Module=Community.sizes, size=Freq)
ggplotly(comm_sizes %>% ggplot(aes(Module, size, fill=size)) + geom_bar(stat='identity') + 
         ggtitle('Number of MeSH Terms in each module') + theme_minimal() + theme(legend.position = 'none'))
visIgraph(graph) %>% visOptions(selectedBy='module', highlightNearest=TRUE) %>% 
                     visIgraphLayout(randomSeed=123)
rm(color_pal, eigen_centrality_output, eigencentr, comm_sizes, edges, nodes)

Most important MeSH terms for modules with more than 100 nodes

module_info = tibble('module'=character(), 'top_term'=character(), size=integer())

for(i in names(sizes(comms_louvain))){
  
  module_vertices = names(modules)[modules==i]
  
  # Creat subgraph
  module_graph = induced_subgraph(graph, module_vertices)
  
  # Calculate eigencentrality
  eigen_centrality_output = eigen_centrality(module_graph)
  eigencentr = as.numeric(eigen_centrality_output$vector)
  names(eigencentr) = module_vertices
  
  module_info = module_info %>% add_row('module'=i, 'top_term'=names(eigencentr)[eigencentr==max(eigencentr)][1], 
                                        'size'=length(module_vertices))
    
  if(length(module_vertices)>100){
    
    # Plot eigencentrality of top 10 MeSH terms
    plot_data = data.frame('mesh_term'=names(eigencentr), 'eigencentrality'=eigencentr) %>%
                top_n(10, wt=eigencentrality)
    print(plot_data %>% ggplot(aes(reorder(mesh_term, eigencentrality), eigencentrality, fill=eigencentrality)) + 
         geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() + 
         ggtitle(paste0('Top terms for Module ', i)) + 
         xlab('MeSH Terms') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
  }

}

rm(i, module_vertices, module_graph, eigen_centrality_output, eigencentr, plot_data, comms_louvain)

Module Network

  • Collapsing edges: sum all original weights and divide by the product of the number of elements in both (the number of possible interactions they could have)

  • The Network plot doesn’t seem to take weights into account

module_graph = contract(graph, graph %>% get.vertex.attribute('module'), vertex.attr.comb='first') %>%
               simplify(edge.attr.comb='sum') %>% set_vertex_attr('name', value=module_info$module) %>%
                                                  set_vertex_attr('title', value=module_info$top_term) %>%
                                                  set_vertex_attr('module_size', value=module_info$size) %>%
                                                  set_vertex_attr('value', value=module_info$size)

nodes_module_graph  = module_graph %>% as_data_frame(what='vertices') %>% mutate('id'=name)
edges_module_graph = module_graph %>% as_data_frame() %>% 
                     mutate('corrected_weight' = weight/(nodes_module_graph$module_size[as.numeric(from)]*
                                                 nodes_module_graph$module_size[as.numeric(to)])*1000)

module_graph = module_graph %>% set_edge_attr('weight', value=edges_module_graph$corrected_weight)

visIgraph(module_graph) %>% visIgraphLayout(randomSeed=123) %>% visOptions(highlightNearest=TRUE)
rm(module_info, nodes_module_graph, edges_module_graph)

I think the relation between modules is clearer here

adj_mat = module_graph %>% as_adjacency_matrix(attr='weight') %>% as.matrix
adj_mat = adj_mat

heatmap.2(adj_mat, cellnote=round(adj_mat,2), dendrogram='none', notecol='gray', scale='none', trace='none',
          key=FALSE, xlab='Modules', ylab='Modules', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))

rm(adj_mat)

Save file with the modules each paper belongs to through its mesh terms

mesh_term_module = graph %>% as_data_frame(what='vertices') %>% dplyr::select(name, module)

mesh_term_module_mat = matrix(0, nrow(mesh_term_module), max(mesh_term_module$module))
rownames(mesh_term_module_mat) = mesh_term_module$name
colnames(mesh_term_module_mat) = sort(unique(mesh_term_module$module))

for(i in seq(1, nrow(mesh_term_module))){
  mesh_term_module_mat[i,mesh_term_module[i,2]] = 1
}

if(!all(colnames(article_mesh_term_mat) == rownames(mesh_term_module_mat))){
  print('Row and column order dont match!')
}

article_module_mat = as.matrix(article_mesh_term_mat) %*% mesh_term_module_mat
rownames(article_module_mat) = rownames(article_mesh_term_mat)

# write.csv(article_module_mat, file='./../Data/article_module_matrix.csv')

print('Distribution of number of modules per article')
## [1] "Distribution of number of modules per article"
table(apply(article_module_mat, 1, function(x) sum(x>0)))
## 
##   1   2   3   4   5   6   7   8   9 
## 936 939 726 440 188  68  21   1   1
melt_article_module = melt(article_module_mat)
colnames(melt_article_module) = c('articleID', 'module', 'value')
melt_article_module$module = as.factor(melt_article_module$module)

ggplotly(melt_article_module %>% ggplot(aes(value, fill=module, color=module)) +
         geom_histogram(position='identity', bins=max(melt_article_module$value), alpha=0.5) + 
         facet_grid(module~., scales='free_y') + ggtitle('Module content distribution per article') + 
         xlab('n_articles') + ylab('Module content') + theme_minimal() + theme(legend.position='none'))
rm(mesh_term_module, mesh_term_module_mat, i)